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BLOCKS ARE TRANSPOSED IN ASCENDING 
ORDER OF BLOCK NUMBERS. IN THE CASE 
OF SQUARE BLOCK 1. THE LOWER TRIANGLE 
IS COPIED INTO A CONTINUOUS AREA. IS 
TRANSPOSED INTO ROWS BY ACCESSING 
IN THE DIRECTION OF ROW AND IS 
STORED IN THE UPPER TRIANGLE IN 
EACH OF THE CASES OF SQUARE 
BLOCKS 2 THROUGH 8. EACH SQUARE 
IN THE FIRST COLUMN, IS COPIED 
AND TRANSPOSED INTO THE 
CORRESPONDING SQUARE IN THE 
FIRST ROW. 
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SUB-ROUTINE FOR TR I -D I AGONAL I Z I NG A REAL SYMMETRIC MATRIX 
subrout i ne tr i d (a, k, n, d i ag. sd i ag) 

! STORE THE LOWER TRIANGLE OF THE REAL SYMMETRIC MATRIX IN a. 

! STORE daig AND sdaig THE DIAGONAL AND SUB-DIAGONAL PORTION OF 

THE TR I -DIAGONAL MATRIX. INFORMATION NEEDED FOR CONVERSION IS STORED 

IN THE LOWER TRIANGLE OF a. 

constant iblk^' set block width' 

shared array a(k, n) . diag(n) . sdiag(n) 

allocate shared array u(n+1, ibik). v(n+1, ibik) 

! U STORES BLOCKS TO BE TR I -DIAGONAL I ZED. AND V IS AN AREA FOR STORING W. 

create threads 

create threads 

set nothrd and numthrd 

nothrd IS A NUMBER FOR EACH THREAD. 1-#TH. numthrd=#TH 

(TOTAL NUMBER OF THREADS) 

nb=(n-2+iblk-l)/iblk 

nbase=0 

do nb-1 

nbase=(i-1)*iblk 

istart=1 

nwidth=iblk 

ca 1 1 copy (a, k. n. nbase. nothrd, numthrd) 
c copy 

u (nbase+1 : n, 1 : i b I k) ^a (nbase+1 : n. nbase+1 : nbase+ i b I k) 
ca 1 1 b I ktr i d (a, k, n, d i ag. sd i ag. nbase, i start, nw i dth. 

u. V, nothrd. numthrd) ! PERFORM LU DECOMPOSITION IN 

PARALLEL. 

copy back 

a (nbase+1 : n. nbase+1 : nbase+ i b I k) (nbase+1 : n. 1 : i b I k) 
ca 1 1 update (a. k. n. nbase. nw i dth, u. v. nothrd. numthrd) 
enddo 

nbase= (nb-1) *ib Iks 

istart=1 

nwidth=n-nbase 

cal I biktr id(a, k. n. diag. sdiag. nbase. istart. nwidth. 
u, V. nothrd. numthrd) 

return 
end 
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EACH BLOCK MATRIX IS CALLED RECURSIVELY IN A TR 1 -D I AGONAL I ZAT I ON ROUTINE, 
nbase IS AN OFFSET INDICATING THE POSITION OF A BLOCK, i start IS AN OFFSET 
IN THE BLOCK OF REDUCED SUB-BLOCK TO BE CALLED RECURSIVELY. AND INDICATES 
THE POSITION OF A TARGET SUB-BLOCK. IT IS SET TO 1 WHEN CALLED FOR THE FIRST 
TIME. 

nwidth REPRESENTS THE SIZE OF A SUB-BLOCK. 

^ubrout i ne b I ktr i d (a, k, n. d i ag. sd i ag, nbase. i start, nw i dth. 

u. V, nothrd. numthrd) 
shared array a (k. n) . diag(n) . sdiag(n) . u (n+1 . *) , v (n+1, *) 

if (nwidth<10) then 

ca 1 1 btun i t (a, k. n. d i ag. sd i ag. nbase. i start, nwi dth. 
u, V. nothrd. numthrd) 

else 

istart2^i start 
nwidth2*-nwidth/2 

ca 1 1 b I ktr i d (a. k. n. d i ag. sd i ag. nbase. i star t2. nw i dth2. 
u. V, nothrd. numthrd) 

BARRIER SYNC 

i starts^ i start+nwi dth/2 

nw i dthS^nw i dth-nw i dth/2 

is2^istart2 

i e2^ i start+nw i dth2-l 

is3^i starts 

i e3^ i start3+nw i d3-l 

iptr^nbase+istar3 

I en*- (n- i ptr+numthrd-1 ) /numthrd 

i s^ i pt r+ (nothrd-1 ) * I en+1 

ie^min(n. iptr+nothrd*len) 

u(is: ie. is3: ie3)^u(is: ie, is3: ie3) 

-u(is: ie. is2: ie2)*w(is3: ie3. is2: ie2)^ 

-W(is:ie. is2: ie2)*U(is3: ie3. is2:ie2)* 

BARRIER SYNC 

call biktr id (a. k. n. diag. sdiag. nbase, 1 starts, nwi dthS. 
u. V. nothrd. numthrd) 

end if 

return 

end 



F I G. 1 3 



PARAIiLEL PROCESSING METHOD ... Oct,. 2, 2003 

Makoto Nakanishi 

Greer, Burns & Craln, Iitd. (Patrick Burns) 
Re£. No. 1503.68506 

Sheet 14 of 29 <312) 360 0080 



c INTERNAL ROUTINE OF biktrid 

subroutine btunit(a,k, n, diag, sdiag, nbase, i start, nwidth, 

u, V, nothrd, numthrd) 
shared :: a (k. n) , diag(n) , sdiag(n) , u (n+1, ♦) , v (n+1, *) 
shared :: tmp(numthrd) , sigma, alpha 
i f (nbase+ i sta rt>n-2) then 
return 
end if 

do i=istart. istart-l+nwidth 
iptr2*-nbase+i 

I en^ (n- i ptr 2+numthrd-1 ) /numthrd 
i s*- i ptr2+ (nothrd-1) * I en+1 
ie*-niln(n, iptr2+nothrd*len) 
BARRIER SYNC 

tmp(nothrd)^u(is: ie, i)**u(is: ie, i) 
BARRIER SYNC 
if (nothrd=l)then 

s igma^sqrt (sum (tmpd: numthrd))) ! SUM IS TO SUM, AND sqrt IS TO EXTRACT 
A SQUARE ROOT. 

diag(lptr2)<^u(jptr2, i) 
sdiag(iptr2) < — sigma 

u(nbase+i+1, i) (nbase+i+1, i)+slgn(u{nbase+i+l, i)*sigma 
alpha=l. 0/(sigma*u(nbase+l+l, D) 
u(iptr2. i)=alpha 
end if 

BARRIER SYNC 
iptr3=iptr2+l 
v(is: ie, i) 

^A(iptr3:n, iptr2+is: iptr2+ie)**u{iptr3:n, i) 
BARRIER SYNC 

I en2-^ ( i -1 +numthrd-l) /numthrd 
i sx*^ (nothrd-1 ) * I en2+l 
iex'^min(r-l, nothrd*len2) 

u(n+l, isx: lex) ^u (nbase+i+1 :n, isx: iex)^*u (i+1 :n, 1) 
V (n+1 , i sx : i ex) ♦-v (nbase+ i +1 : n, i sx : 1 ex) **u (i +1 ■ n, 1) 
BARRIER SYNC 

v(ls: ie, i)^alpha*(v(is: ie, i)-v(is: ie, 1 : i-1)*u(n+1, 1 : i-1)^ 
-u(is:ie, l:i-l)*v(n+l,l:i-1)* ) 

BARRIER SYNC 

tmp (nothrd) -^v (i s : i e, i ) ^*u ( i s : i e, i ) 
BARRIER SYNC 
if (nothrd=l)then 

beta-^0. 5*a I pha*sum (tmp (1 : numthrd) ) 
end if 

BARRIER SYNC 

v(is: ie, i)^v(is: Ie, i ) -beta*u ( i s : ie, i) 
BARRIER SYNC 
if (i<iblk )then 
if (ptr2<n-2)then 

u(is: ie, i+l)^u(is: ie, i+l)-u(is: ie, i start: i)*v(i+1, i start: j)* 

-v(is: ie, istart: i)*U(i+l, istart: i)* 

else 

u (is: ie, i+1 : i+2)*— u(is: ie, i+1 : i+2)-u (is: ie, istart: i)*v(n-1 :n, istart: I)* 

-V ( i s : i e, i start : i ) *U (n-1 : n, i start : i ) ^ 

return 
end if 
end if 
enddo 

el iminate threads 



return 
end 
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ROUTINE FOR UPDATING THE LOWER HALF OF A MATRIX BASED ON u AND v 

nbase IS AN OFFSET INDICATING THE POSITION OF A BLOCK, nwidth REPRESENTS BLOCK 

WIDTH. 

subrour ine update (a, k. n, nbase. nwidth. u, v, nothrd. numthrd) 
shared array a(k. n). u(n+l. *). v(n+l. *) 

BARRIER SYNC 

blk^nwidth 

nbase2^nbase+nw i dth 

I en^ (n-nbase+2*numthrd-1) / (2*numthrd) 

i s I ^nbase+ (nothrd-1) * I en+1 

iel^min(n, nbase+nothrd*len) 

nbase3^nbase+2*numth r d* I en 

i sr -^nbaseS-nothrd* I en+1 

ier^minCn. isr+len-1) 

a(iel+1 :n, is! : iel) 

^a(iel+1:n. isl : iel)~w(iei+1 :n. 1 :blk)*u(isl : iel. 1 :blk)^ 
-u(iel+l:n. l:blk)*w(isl:ieM:blk)* 

a (ier+1 :n, isr : ier) 

^a(ier+1 :n. isr: iel)-w(ier+1 :n. 1 :blk)*u(isr : ier. 1 :blk)* 
-u(ier+1:n. 1:blk)*w(isr: iel. 1:blk)* 

ca 1 1 trupdate (a. k. n. i s I , i e I , u. v. b I k) 

cal I trupdate (a. k. n, isr. ier. u. v, bik) 

BARRIER SYNC 

return 

end 

UPDATE OF DIAGONAL MATRIX PORTION 

subroutine trupdate (a. k, n. is. ie. u. v. bIk) 

constant blk2^BL0CK WIDTH FOR DIAGONAL BLOCK UPDATE 

shared array a(k. n). u(n+1. *), v(n+l, *) 

do i = is. ie. blk2 
is2^i 

ie2^min(i+blk2-1. ie) 

a(is2: ie. is2: ie2) 

^a(is2: ie. is2: ie2)-w(is2: ie. 1. blk)*u(is2: ie2. 1 :blk)t 
-u(is2: ie, 1. blk)*w(is2: ie2. 1:blk)t 

enddo 

return 
end 

subroutine copy (a. k. n. nbase. nothrd, numthrd) 

I en^ (n-nbase+2*numthrd-1) / (2*numthrd) 

i s I ^nbase+ (noth rd-1 ) * I en+1 

I en I =max (0, m 1 n (n- i s I +1 , I en) ) 

nbase3^nbase+2*numthrd*len 

i sr*— nbase3-nothrd* I en+1 

lenr=max (0, min(n-isr+1. I en)) 

ca 1 1 bandcp (a. k. n. i s I . I en) 
cal I bandcp (a. k. n. isr. ier) 



return 
end 
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c ROUTINE FOR COPYING THE UPDATED LOWER TRIANGLE INTO THE UPPER TRIANGLE 
subroutine bandcp(a. k. n. is, I en) 
conctant nb^size of small buffer 
private w(nb. nb) 

nn^minCnb, I en) 

I oopx^ ( I en+nn-1 ) / nn 

do j=1, loopx 

ip*-is+(j-1)*nn 

nl^len-(j-1)*nn 

nnx^minCnn. n1) 

Ien2*-n-ip+l 

I oopy^ ( I en2+nnx-1 ) /nnx 

is2=is+(j"1)*nnx 

TRL (w (1 : nnx, 1 : nnx) ) ^TRL (a ( i s2 : i s2+nnx-1 , i s2 : i s2+nnx) ) 
TRU (a ( i s2 : i s2+nnx-1 , i s : i s+nnx) ) *-TRL (w (1 : nnx, 1 : nnx) ) ' 

! TRL AND TRU REPRESENT A LOWER TRIANGLE AND AN UPPER 
TRIANGLE. RESPECTIVELY. 

do i=2. loopy-1 
is3^is2+(i-1)*nnx 

w (1 : nnx, 1 : nnx) -^a ( i s3 : i s3+nnx-1 . i s2 : i s2+nnx) ) 
a ( i s2 : i s2+nnx. i s3 : i s3+nnx-1 ) (1 , nnx : 1 , nnx) * 
enddo 

is3+(loopy-l)*nnx 
ny*-n-is3+l 

w (1 : ny, 1 : nx) ^a ( i s3 : n, i s2 : i s2+nnx) ) 
a ( i s2 : i s2+nnx, i s3 : n) (1 , ny : 1 . nx) * 
enddo 

return 
end 
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ROUTINE FOR CONVERTING THE EIGENVECTOR OF A TR I -DIAGONAL MATRIX 
(STORED IN ev(l:n. 1:nev)) INTO THE EIGENVECTOR OF THE ORIGINAL MATRIX 
a REPRESENTS TR I -D I AGONAL I ZAT I ON OUTPUT AND STORES INFORMATION NEEDED 
FOR CONVERSION IN THE LOWER TRIANGLE. 

subroutine convevCa. k. n. ev. nev) 
shared array a (k. n) . ev (k, n) 

create threads 

set nothrd and numthrd 

c nothrd REPRESENTS THE NUMBER OF EACH THREAD. AND 1-#TH. numthrd=#TH 
(TOTAL NUMBER OF THREADS) 

BARRIER SYNC 

I en^ (nev+numthr d-1 ) /numthrd 

is^(nothrd-1)*len+1 

ie^minCnev, nothrd* I en) 

nevthrd^max ( i e- i s+1 . 0) 

ca 1 1 convevthrd (a. k. n. ev (1 . is), nevthrd) 

BARRIER SYNC 

return 
end 
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ROUTINE FOR CONVERTING AN EIGENVECTOR 
subroutine convevthrdCa. k, n, ev. iwidth) 
constat blk^BLOCK WIDTH 
shared array a (k, n) 
array ev(k. *) 

private w(blk. n). w2(blk. bik) 

if (iwidth<0) then 

return 

end if 

numblk=(n-2+blk-1)/blk 
nf bs^n-2-b I k* (numb I k-1) 
do i=n-2, n-2-nfbs+1.-l 

alpha^-a(i. i) ! alpha IS STORED IN A DIAGONAL ELEMENT AT THE TIME OF 
TRI-DIAGONALIZATION. 

x(l : iwidth)*— ev(i+1 :n. 1 : iwidth)**a (i+1 :n, i) 
ev(i+1 :n, 1 : iwidth)^ 

ev(i+1 :n, 1 : i width) +aipha*a(i+1 :n. i)*x(1 : iwidth)* 
enddo 

do i=1, numb I k-1 
i s<-n-2- (nf bs+ i *b I k) +1 
i e«- i s+b I k-1 
w(1:blk. iwidth) 

a(is+1 :n, is: ie)**ev(is+1 :n, 1 : iwidth) 
w(1 :blk-1. 1 : iwidth) *~w(1 :blk-1, 1 : iwidth) 

+TRL(a(ie+1 : is, is: i e) ) **ev ( i e+1 : is, 1 : iwidth) 
! TRL REPRESENTS A LOWER TRIANGULAR MATRIX. 
DIAG(w2)^DIAG(a(is:ie, is:ie))! DIAGONAL ELEMENT VECTOR OF A DIAG MATRIX, 
do i2=blk. 1.-1 
do il = i2-1.1.-1 
w2(il. i2) = 

^w2(i1. i1)*(a(is+i2:n, is+i2-1)**a(is+i2:n. is+il-D) 

enddo 

enddo 

do i1=blk-1,1.-l 
do i2=blk. i 1+1.-1 

w2(i1. i2)^w2(i1. i2)+w2(i1. 11+1 : i2-1)*w2(i1+1 : i2-1. 12) 

enddo 

enddo 

do i2=blk. 1.-1 
do i1 = i2-1.1.-1 

w2(i1. i2)^w2(il, i2)*w2(i2. i2) 

enddo 

enddo 

w(1:blk, 1: iwidth) — 

w (1 :b Ik. 1: iwidth)+TRU(w2)*w(1:b Ik. 1: iwidth) ! TLU REPRESENTS AN UPPER TRIANGLE 

MATRIX. 

ev(is+1 :n. 1 : iwidth) 

— a(is+1 :n, is: ie)*w(1 :blk. 1 : iwidth) 

ev(ie+1 : is, 1 : iwidth) 

— TRL(a(ie+1:is. is: ie-1))*w(1 :blk-1. 1 : iwidth) 
enddo 



FIG. 18 



return 
end 
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SUBROUTINE trid FOR 
TRI-DIAGONALIZING A REAL 
SYMMETRICAL MATRIX 
I 



INPUT shared ARRAYS. A(k.n). diad(n) AND sdiag(n) AS 
SUBROUTINES, diag. sdiag RETURN THE DIAGONAL AND SUB- 
DIAGONAL ELEMENTS OF A CALCULATED TR I -DIAGONAL MATRIX AS 
OUTPUT. WORK AREAS U(n+1. ibik) AND v(n+l. ibik) ARE RESERVED 
IN THE ROUTINE AND ARE USED AS A shared ATTRIBUTE. 



/ 



S10 



GENERATE THREADS. 

SET THE TOTAL NUMBER OF THREADS IN A LOCAL AREA numthr FOR 
EACH THREAD AND THREAD NUMBERS ASSIGNED TO EACH THREAD IN 

nothrd. 

SET THE FOLLOW I NGS IN EACH THREAD. 

SET BLOCK WIDTH IN ibik. 

SET nb=(n-2+iblk-1)/iblk. nbase=0 AND i=1. 



/ 



S11 
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S13- 



SET nbase =(i-1)*iblk. istart=l 
AND nwidth=iblk. 



SET nbase=(nb-1)*iblk. istart=1 
AND i b I k2=n-nbase. 



S14- 



S15 



CALL A 
''SUBROUTINE COPY^ 
AND COPY THE 
LOWER TRIANGLE 
IN THE UPPER 
TRIANGLE. 



S20 



COPY A BLOCK TR I -D I AGONAL 1 ZAT I ON 
TARGET AREA IN WORK AREA U. 
U (nbase+1 : n. 1 : nw i dth) ^ 
A(nbase+1 :n. nbase+1 :n) 



COPY A BLOCK TR I -DIAGONAL I ZAT I ON 
TARGET AREA IN WORK AREA U. 
U (nbase+1 :n, 1 : ibik) ^ 
A (nbase+1 : n, nbase+1 : nbase+ ibik) 




RETURN THE TR I -DIAGONAL I ZED 
AREA TO ARRAY A. 

A (nbase+1 : n, nbase+1 : n) 
U (nbase+1 :n. 1 :nwidth) 



RETURN THE TR I -D I AGONAL I ZED 
AREA TO ARRAY A. 
A (nbase+1 : n. nbase+1 : nbase+ i b I k) ^ 
U (nbase+1 :n, 1 : ibik) 



823 



DELETE THE THREADS GENERATED 
FOR THE PARALLEL PROCESSING. 



y 



SIS- 



CALL A 
'SUBROUTINE UPDATE> 

AND UPDATE THE 
LOWER TRIANGLE OF 
A(nbase+iblk:n, 
nbase+iblk:n). 



^ return^ 



FIG. 19 
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SUBROUTINE biktrid 
(RECURSIVE PROGRAM) subroutine biktrid 
(A, k, n, d i ag, sd i ag, nbase, i start, 
nwidth, U, V, nothrd, numthrd), WHERE nbase IS 
AN OFFSET INDICATING THE POSITION OF A BLOCK, 
i start IS AN INTRA-BLOCK OFFSET OF A REDUCED 
SUB-BLOCK TO BE RECURSIVELY USED AND 
INDICATES THE POSITION OF THE TARGET 
SUB-BLOCK, WHICH IS SET TO "1" WHEN 
CALLED FOR THE FIRST TIME AND nwidth 
REPRESENTS ITS BLOCK WIDTH. 



S25 
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CHANGE AN UPDATE POSITION AND A BLOCK WIDTH WHICH 
ARE USED FOR RECURSIVE CALLING, SET istart2=i start 
AND nwidth2=nwidth/2 AND TRANSFERS THEM. 
TRANSFER THE START POSITION AND WIDTH OF THE REDUCED 
BLOCK. 



S28 




S29- 



APPLY Barrier SYNCHRONIZATION 
BETWEEN THREADS. 



S30- 



S31 



CALCULATE START A POSITION (is2, is3) AND AN END POSITION 

(ie2. ie3), WHICH ARE SHARED WITH EACH THREAD IN UPDATE. 

i sta r t3= i sta r t+nw i dth/2, nw i dth3=nw i dth-nw i dth/2, 

i s2= i start2, i e2= i start+nw i dth2-1 , 

i s3= i starts, i e3= i start3+nw i dth3-1 

i ptr=nbase+ i starts, 

I en= (n- i ptr+numthrd-1) /numthrd, 

is=iptr+(nothrd-1)*len+1. ie=min(n, iptr+nothrd*len) 

I 



U(is: ie, is3: le3)=U(is: ie, is3: Ie3)-U(is: ie, is2: ie2)*W(is3: ie3, is2: ie2)' 

-W(is: ie, is2: ie2)*U(is3: ie3. is2: ie2)* 



S32- 



I 



APPLY barrier SYNCHRONIZATION 
BETWEEN THE THREADS. 



S33 



FIG. 2 0 
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^SUBROUTINE btunit (INTERNAL ROUTINE OF biktrid^ 
^ btun i t (A. k. n. d i ag. sd i ag. nbase, i sta r t, nw i dth, U, V, 
nothrd. numthrd) 



ASSIGN tmp (numthrd). Sigma AND alpha 
ACCORDING TO ITS shared ATTRIBUTE. 



S35 
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return ) 



FIG. 2 1 
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VI 



CALCULATE AN START POSITION (is) 

AND AN END POSITION (ie), 

WHICH ARE SHARED WITH EACH THREAD. 

i ptr2=nbase+ i , I en= (n- i ptr2+numthrd-1) /nunrthrd, 

i s= i pt r2+ (nothrd-1 ) * I en+1 , 

ie=min(n, iptr2+nothrd*len) 

I 




APPLY barrier SYNCHRONIZATION. 



tmp(nothrd)=U(is: ie. i)**U(is:ie. i) 



APPLY barrier SYNCHRONIZATION, 




S44 



S45 



CALCULATE THE SQUARE ROOT OF THE SUM OF VALUES 

PARTIALLY CALCULATED IN EACH THREAD AND 

TR I -DIAGONAL I ZE THE SQUARE ROOT (GENERATE A 

HOUSEHOLDER VECTOR) 

sigma=sqrt (sum(tmp(1 : numthrd))) !, 

WHERE "SUM" AND sqrt REPRESENT SUM AND SQUARE 

ROOT, RESPECTIVELY. diag(iptr2)=u(iptr2. i). 

sd i ag ( i ptr2) =-s i gma. 

U(nbase+i+1. i)=U(nbase+i+1. i)+sign(u(nbase+i+1, i) 
*s 1 gma a I pha=1 . 0/ (s i gma*u (nbase+ i +1 . i ) . 
U (iptrZ, i)=alpha 



S46- 



APPLY barrier SYNCHRONIZATION. 



S47- 



iptr3=iptr2+l 



S48 



© 



T 



V(is:ie, i)=A(iptr3:n. Iptr2+is: iptr2+ie)* 

U(iptr3:n. i) 



© 
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0 



APPLY barrier SYNCHRONIZATION. 



S49 



VCis: ie, i)=alpha*(V(is: ie, i) 
-V(is: ie, 1: i-1)*(U(iptr3:n. 1 : i-D* 
*U(iptr3:n, i)) 

-U(is: ie. 1:i-1)*(V(iptr3:n. 1: i-D* 
*U(iptr3:n. i))) 



S50 



APPLY barrier SYNCHRONIZATION. 



tmp(nothrd)=V(is: ie. i)* *U(is: ie, i) 



APPLY barrier SYNCHRONIZATION. 




S54 



beta=0. 5*alpha*sum 
(tmpd :numthrd)), 
WHERE sum REPRESENTS 
THE SUM OF VECTORS. 



APPLY barrier SYNCHRONIZATION. 



V(is: ie. i)=V(is: ie, i ) -beta*U ( i s : ie, i) 



APPLY barrier SYNCHRONIZATION. 
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U(is: ie, 1+1) 

=U(is: ie, i+l)-U(is: ie, istart: I)* 
V(i+1. istart: i)* -V(is:ie, istart: i) 
*U(n+1, istart: i)* 
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S57 



S58 
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0 



U(is: ie, 1+1 : i+2) 

=U(is: ie, i+1 : i+2)-U(is: ie. istart: i)* 
V(i+1:n, istart: i)^ -V(is:ie, istart: i) 

*U(n+1 :n, istart: i)^ 



return ^ ^return^ 



FIG. 2 2 
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C 



SUBROUT 



INE update j 
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APPLY barrier SYNCHRONIZATION. 



MAKE A PAIR IN EACH THREAD AND DETERMINE START AND END 
POSITIONS, WHICH ARE SHARED WITH EACH THREAD IN UPDATE. 
nbase2=nbase+iblk 

I en= (n-nbase2+2*numthrd-l) / (2*numthrd) , 
is1=nbase2+(nothrd-1) len+1, ie1=min(n, nbase2+nothrd*len), 
nbase3=nbase2+2*numthrd* I en, 1 sr=nbase3-nothrd* I en+1 . 
ier=min(n, isr+len-1) 



S67- 



A(ie1+1:n, isl : ie1)=A(ie1+1 :n, is1:iel)- 



W(ie1+1:n, 1:blk)*U( 
U(ie1+1:n. 1:blk)*W( 
A(ier+1 :n, isr: ier)=A( 
W(ier+1:n. 1:blk)*U( 



s1:ie1,1:blk)^ 
si: lei, l:blk)* 
er+1 :n, isr: ier)- 
sr: ier, 1 :blk)^ 



U(ier+1:n, l:blk)*W(isr: ier. l:blk)* 
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SUBROUTINE trupdate 
(UPDATE OF A DIAGONAL MATRIX) INPUT UPDATE 
START POSITION is AND UPDATE END POSITION 
le. WHICH ARE USED TO UPDATE A RECTANGLE 
LOCATED UNDER THE DIAGONAL BLOCK BEFORE THE 
SUBROUTINE IS CALLED. 



S75 



SET BLOCK WIDTH FOR A DIAGONAL 
BLOCK IN blk2. 
SET i=is. 





S77 ^^"""Yn"^^ 


DETERMINE START AND END POSITIONS 




IN EACH THREAD. 




is2=i, ie2=min(i+blk2 -1. ie-1) 




a(is2: ie-1. is2. Ie2)= a(ls2: ie-1. is2, ie2) 




-U(is2: ie-1.l:blk)*W(is2:ie2. l:blk)* 




-W(is2: ie-1, 1 :blk)*U(is2: ie2. 1 :blk)* 





S78 



i=i+blk2 



(Return 



) 



FIG. 2 4 
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c 



SUBROUTINE copy 



S80 



3 



CALCULATE A START POSITION AND WIDTH USED 

TO EXECUTE COPYING IN PARALLEL AFTER MAKING 

A PAIR IN EACH THREAD. 

I en= (n-nbase+2*numthrd-1 ) / (2*niBnthrd) . 

I si =nbase+ (nothr d-1 ) * I en+1 , 

Ien1= max(0. min(n-is1+1. I en)). 

nbase3=nbase+2*numthrd* I en. 

i sr=nbase3-nothr d* I en+1 . 

I enr=max (0. m i n (n- i sr +1 . I en) ) 



SSI 



GALL A 
SUBROUTINE bandcp. 
COPY AN AREA. WHICH IS 
DETERMINED BY A START 
POSITION isl AND WIDTH 
leni ON THE LEFT SIDE 
OF THE PAIR. 



S82- 



CALL A 
SUBROUTINE bandcp. 
COPY AN AREA, WHICH IS 
DETERMINED BY A START 
POSITION isr AND WIDTH 
lenr ON THE RIGHT SIDE 
OF THE PAIR. 



return 



FIG. 2 5 
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ON 



SUBROUTINE bandcp 
COPY AN AREA WHILE TRANSPOSING THE AREA 
A CACHE. USING A SMALL WORK AREA WX. WX(nb. nb) 
RECEIVE A START POSITION AND WIDTH IN 
is AND I en. RESPECTIVELY. 



S85 



nn=m i n (nb . I en) . I oopx= ( I en+nn-1 ) /nn . j =1 




DETERMINE THE SIZE nnx AND ITS OFFSET ip OF A 
DIAGONAL BLOCK TO BE COPIED TO WX. 
ip=is+(j-1)*nn, n1=len-(j-l)*nn, nnx=min(nn, n1), 
I en2=n- i p-nnx+1 , I oopy= ( I en2+nn-1 ) /nn 
TRL (WX (1 : nnx. 1 : nnx) ) =TRL (A ( i p : i p+nnx-1 . i p : i p+nnx-1 ) ) , 
TRU (A ( i p : i p+nnx-1 . i p : i p+nnx-1 ) ) =TRL (WX (1 : nnx. i : nnx) ) 
(. WHERE TRU AND TRL REPRESENT UPPER AND LOWER 
TRIANGLES. RESPECTIVELY) 
i=1, is2=ip, is3=ip+nnx 




TRANSPOSE AND COPY nnx nnx 

WX(1 :nn. 1 :nnx)=A(is3: is3+nn-l. is2: is2+nnx-1). 

A ( i s2 : i s2+nnx-1 . i s3 : i s3+nn-1 ) =WX (1 . nn : 1 , nnx) ^ 

is3=is3+nn 



890 



COPY THE LAST 


PART. 


nn=n- i s3+1 




WX(1:na1:nx) 




=A(is3:n, is2 


: is2+nnx-1)) 


A(is2: is2+nnx- 


-1, i$3:n) 


=WX(1:nn, 1:nx) 



(^return^ 



FIG. 2 6 
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SUBROUTINE convev 
THE NUMBER nev OF EIGENVECTORS TO BE 
CALCULATED AND A HOUSEHOLDER VECTOR ARE 
STORED IN THE LOWER HALF. THE EIGENVECTORS OF 
A TR I -DIAGONAL MATRIX ARE STORED IN 
ev (k, nev) . 



S95 



GENERATES THREADS. SET THE TOTAL NUMBER OF 
THREADS AND THEIR NUMBERS (1 THROUGH numthrd) IN 
THE numthr AND nothrd. RESPECTIVELY. OF THE 
LOCAL AREA OF EACH THREAD. 



S96 
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APPLY barrier SYNCHRONIZATION. 



DETERMINE START AND END POSITIONS. WHICH ARE 
SHARED WITH AND CALCULATED IN EACH THREAD. 
I en= (nev+nymthr d-1 ) /numthrd. 
is=(nothrd-1)*len+1. ie=inin(nev. nothrd*len) 
width=ie-is+1 



T 



S98 



CALL A SUBROUTINE 
convevthrd AND CONVERTS 
THE EIGENVECTORS OF THE 
TR I -DIAGONAL MATRIX INTO 
THOSE OF THE ORIGINAL MATRIX. 
TRANSFER AN AREA WHERE EIGEN- 
VECTORS SHARED WITH EACH 
THREAD ARE STORED AND THE 
NUMBER width OF THE 
EIGENVECTORS. 



S99 
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APPLY barrier SYNCHRONIZATION. 



DELETE THE GENERATED THREADS. 



FIG. 2 7 
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^SUBROUTINE convevthrd CONVERT THE EIGENVECTORS OF^ 
/ THE TR I -DIAGONAL MATRIX, WHICH ARE SHARED WITH EACH 

THREAD INTO THOSE OF THE ORIGINAL MATRIX. A VECTOR AND 
\ A COEFFICIENT THAT RESTORE HOUSEHOLDER CONVERSION ARE 
STORED IN ARRAY A. 



I 



SET BLOCK WIDTH IN bik. 

THE BLOCK WIDTH IS APPROXIMATELY 80. 



S110 



S1 11 
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N 



^ return^ 



OBTAIN THE FIRST BLOCK TO BE CONVERTED 
BY SEQUENTIALLY CALCULATING (I + auu*) 
IN THE FOLLOWING LOOP. 

numb I k= (n-2+b I k-1 ) /b I k, nf bs=n-2-b I k* (numb I k-1 ) 



S113 



N 



S115- 



i=l 



S116 




alpha=-a(i, 1). 

S^^'^aJ x(1: iwidth)=a(i+1:n. i)^ * ev (i+1 :n. 1 :width) 
ev(i+1 :n, 1 :width)=ev(i+1 :n, 1 :width) 

+a I pha*a (i +1 : n, i)*x (1 : iwidth)* 




DIVIDE U**EV OF (l+UBU*) IN A BLOCK FORM INTO AN UPPER 
TRIANGLE MATRIX LOCATED AT THE LEFT END OF U* AND A 
RECTANGLE LOCATED ON THE RIGHT SIDE AND CALCULATE THEM 
SEPARATELY. 

1 s=n-2- (nf ns+ i *b I k) +1 , 1 e= i e+b I k-1 , 

w(1 :blk, iwidth)=a (ie+1 :n, is: ie)^ *ev(ie+1 :n, 1 : iwidth) 

w(1:blk-1. 1: iwidth)=w(1:blk-1. 1: iwidth) + 

TRL(a(is+1 : ie, is: ie-1))* *ev(is+1 : ie, 1 : iwidth) 
THEN. CALCULATE B OF (l+UBUt) IN A BLOCK FORM. 
diag(w2)=-diag(a(is: is+blk-1, is: is+blk-1)), i2=blk 
STORE A CORRESPONDING COEFFICIENT a IN w2 (.WHERE TRL(w2) 
= AND diag(x) REPRESENT THE LOWER TRIANGLE MATRIX OF w2 
AND THE DIAGONAL ELEMENT OF x. RESPECTIVELY). 




© 



STORE THE INNER PRODUCT OF THE 
HOUSEHOLDER VECTOR xa IN THE UPPER 
TRIANGLE OF W2. 
11=12-1 




S121 



w2(i1. i2)=w2(i1, i1)* 
(a(is+i2:n, is+i2-1)* 
*a(is+i2:n. is+i1-1)). 
i1=i1-1 

I 



SI 22 



i2=i2-1 



© © 



FIG. 2 8 
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SI 23 



9 



® 



SET i1=blk-2 AND CALCULATE AN EXPANSION 
COEFFICIENT IN A DOUBLE LOOP. DETERMINE THE 
UPPER SIDE OF A TRIANGLE MATRIX FROM RIGHT TO 
LEFT AND CALCULATE IT IN SUCH A WAY AS TO PILE 
IT UP. WHICH CORRESPONDS TO DETERMINING A 
COEFFICIENT BY ADDING EXPANSION OBTAINED BY 
APPLYING HOUSEHOLDER CONVERSION FROM THE LEFT. 




DETERMINE THE ELEMENTS OF 
THE UPPER SIDE FROM LEFT TO RIGHT. 
USE AN IMMEDIATELY PRECEDING 
COEFFICIENT. 
w2(i1. i2) 
=w2(i1. i2)+w2(i1, i1+1: 12-1) 
*w2(i1+1:i2-1. 12) 
i2=i2-1 



SI 28' 



i1=i1-1 



SI 35 



MULTIPLY COEFFICIENT 
a IN THE FOLLOWING 
LOOP WHICH LACKS, 
i 1=12-1 



SI 32 




i2=i2-1 

^ I 

SI 34 



w2(i1. i2) 
=w2(i1. i2)* 
w2(i2. 12) 
i1=i1-1 

SI 33 



CALCULATE BU* AND STORE IT IN W. 
W(1:blk. 1: iwidth)=TRU(w2)*W(1:blk, l: iwidth) 
CALCULATE (l+UBU*)*EV USING A TRIANGLE LOCATED 
IN THE UPPER SECTION, A RECTANGLE LOCATED IN THE 
LOWER SECTION AND BU^ STORED IN W. 
ev(ie+1 :n. 1 :width)=ev(ie+1 :n. 1 :width) + 
a(ie+1 :n, is: ie)*W(1 :blk, 1 :width) 
ev(is+1 : ie, 1 : width) =ev (i s+1 : ie. 1 :width) + 
TRL(a(is+1 : ie, is+1 : ie))*W(1 :blk-1. 1 :width) 



FIG. 2 9 
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